David Silver Reinforcement Learning: Easy21
Course homepage: https://www.davidsilver.uk/teaching/
This post works through the Easy21 assignment from David Silver's reinforcement learning course.
References
https://github.com/hartikainen/easy21
https://github.com/mari-linhares/easy21
1 Implementation of Easy21
Environment implementation (saved as env.py, since the later scripts use `from env import *`):
import numpy as np

# Coarse-coding intervals for the dealer card, the player sum and the actions
# (used later for linear function approximation).
dealer = [[1, 4], [4, 7], [7, 10]]
player = [[1, 6], [4, 9], [7, 12], [10, 15], [13, 18], [16, 21]]
action = [0, 1]

def update(s, i):
    """
    Draw one card and update one side of the state in place.

    Parameters
    ----------
    s : state array
    i : index (0: dealer, 1: player)

    Returns
    -------
    None.
    """
    # 1/3 probability of a red card (subtract), 2/3 of a black card (add)
    p = np.random.rand()
    v = np.random.randint(1, 11)
    if p < 1 / 3:
        s[i] -= v
    else:
        s[i] += v

def init():
    # Both the dealer and the player start with one black card in [1, 10]
    return np.random.randint(1, 11, 2)

def judge(s, i):
    # Check whether the given side has gone bust (sum > 21 or < 1)
    if s[i] > 21 or s[i] < 1:
        return True
    return False

def get_action():
    # Random action: 0 (hit) or 1 (stick)
    return np.random.randint(2)
class Easy21():
    def step(self, s, a):
        """
        Parameters
        ----------
        s : state array, dealer s[0], player s[1]
        a : action, hit: 0; stick: 1

        Returns
        -------
        next state s', reward r, terminal flag
        """
        flag = False
        r = 0
        if a == 0:
            # Player hits: draw a card and check for bust
            update(s, 1)
            if judge(s, 1):
                r = -1
                flag = True
        else:
            # Player sticks: the dealer draws until reaching 17 or going bust
            flag = True
            while s[0] < 17 and s[0] > 0:
                update(s, 0)
            if judge(s, 0):
                r = 1
            else:
                # Compare the final sums
                if s[0] < s[1]:
                    r = 1
                elif s[0] == s[1]:
                    r = 0
                else:
                    r = -1
        return s, r, flag
def epsilon_greedy(Q, N, key, N0):
    # Epsilon-greedy action selection with decaying epsilon = N0 / (N0 + N(s))
    n = N[key].sum()
    epsilon = N0 / (N0 + n)
    p = np.random.rand()
    if p >= epsilon:
        a = np.argmax(Q[key])
    else:
        a = get_action()
    return a

def get_feature(s, a):
    # Binary coarse-coded feature vector of length 3 * 6 * 2 = 36
    feature = []
    for d in dealer:
        for p in player:
            for act in action:
                if d[0] <= s[0] <= d[1] and p[0] <= s[1] <= p[1] and a == act:
                    feature.append(1)
                else:
                    feature.append(0)
    return np.array(feature)

def greedy(theta, s, epsilon):
    # Epsilon-greedy action selection for the linear value function theta
    p = np.random.rand()
    f1 = get_feature(s, 0)
    f2 = get_feature(s, 1)
    v = [np.dot(theta, f1), np.dot(theta, f2)]
    if p >= epsilon:
        a = np.argmax(v)
    else:
        a = get_action()
    return a

def error_square(Q1, Q2):
    # Sum of squared differences between the greedy values of two Q tables
    error = 0
    for key in Q1:
        error += (np.max(Q1[key]) - np.max(Q2[key])) ** 2
    return error

def get_dict():
    # Tabular Q/N/E initialised to zero for every (dealer, player) state
    X = np.arange(1, 11)
    Y = np.arange(1, 22)
    res = dict()
    for x in X:
        for y in Y:
            res[(x, y)] = np.array([0.0, 0.0])
    return res
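As a quick sanity check (not part of the assignment), one can roll out a single episode with a uniformly random policy; this sketch assumes the code above is saved as env.py:

from env import *

# Play one episode of Easy21 with random actions and print each transition.
env = Easy21()
s = init()
flag = False
while not flag:
    a = get_action()
    s, r, flag = env.step(s, a)
    print("state:", tuple(s), "action:", a, "reward:", r)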
2 Monte-Carlo Control in Easy21
The update rule is:
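Spelled out (these are exactly the updates the code below performs), for each state-action pair visited in an episode with return $G_t$:

$$N(S_t, A_t) \leftarrow N(S_t, A_t) + 1, \qquad \alpha_t = \frac{1}{N(S_t, A_t)},$$
$$Q(S_t, A_t) \leftarrow Q(S_t, A_t) + \alpha_t \bigl(G_t - Q(S_t, A_t)\bigr),$$

with a time-varying $\varepsilon$-greedy policy, $\varepsilon_t = N_0 / (N_0 + N(S_t))$ and $N_0 = 100$.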
from env import *
import numpy as np
import pickle

gamma = 1
N0 = 100
Q = get_dict()
N = get_dict()
K = 1000000

def mc(Q, N, N0=100, gamma=1):
    # Run one episode and apply the every-visit Monte-Carlo control update
    flag = False
    s = init()
    env = Easy21()
    # Trajectory of (state, action, reward)
    trace = []
    while not flag:
        key = tuple(s)
        a = epsilon_greedy(Q, N, key, N0)
        s, r, flag = env.step(s, a)
        trace.append((key, a, r))
        # Update the visit counts
        N[key][a] += 1
    # Backward pass: accumulate the return and update Q
    G = 0
    n = len(trace)
    for i in range(n - 1, -1, -1):
        s, a, r = trace[i]
        G = gamma * G + r
        alpha = 1 / N[s][a]
        Q[s][a] += alpha * (G - Q[s][a])

for i in range(K):
    mc(Q, N)

# Save the learned Q table
with open("mcQ.pickle", "wb") as file:
    pickle.dump(Q, file)
The resulting value-function plot (generated with the plotting function in the appendix):
3 TD Learning in Easy21
The algorithm is backward-view Sarsa(λ).
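Concretely, the updates performed by `td` below are: after each transition $(S, A, R, S', A')$,

$$\delta = R + \gamma Q(S', A') - Q(S, A), \qquad E(S, A) \leftarrow E(S, A) + 1,$$
$$Q(s, a) \leftarrow Q(s, a) + \alpha\,\delta\,E(s, a), \qquad E(s, a) \leftarrow \gamma\lambda\,E(s, a) \quad \text{for all } (s, a),$$

with step size $\alpha = 1 / N(S, A)$ and the same decaying $\varepsilon$-greedy schedule as in the Monte-Carlo section; at a terminal transition the bootstrap term $\gamma Q(S', A')$ is dropped. The implementation: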
import numpy as np
import pickle
import matplotlib.pyplot as plt
from env import *

def td(Q, N, lambda_, N0=100, gamma=1):
    # One episode of backward-view Sarsa(lambda)
    flag = False
    s = init()
    key = tuple(s)
    a = epsilon_greedy(Q, N, key, N0)
    env = Easy21()
    # Eligibility traces
    E = get_dict()
    while not flag:
        key = tuple(s)
        # Take the action
        s, r, flag = env.step(s, a)
        # Choose the next action from the next state
        key1 = tuple(s)
        a1 = epsilon_greedy(Q, N, key1, N0)
        # Update the visit count and the eligibility trace
        N[key][a] += 1
        E[key][a] += 1
        # TD error; bootstrap only if the episode has not terminated
        delta = r - Q[key][a]
        if not flag:
            delta += gamma * Q[key1][a1]
        # Update every state-action pair in proportion to its trace
        alpha = 1 / N[key][a]
        for k in Q:
            Q[k] += alpha * delta * E[k]
            E[k] *= gamma * lambda_
        a = a1
# Load the Monte-Carlo Q table as the ground truth
with open("mcQ.pickle", "rb") as file:
    Q1 = pickle.load(file)

Lambda = np.linspace(0, 1, 11)
N0 = 100
K = 10000
Error = []
for lambda_ in Lambda:
    Q = get_dict()
    N = get_dict()
    for i in range(K):
        td(Q, N, lambda_)
    error = error_square(Q, Q1)
    Error.append(error)
    # Save the Q tables for lambda = 0 and lambda = 1
    if lambda_ == 0 or lambda_ == 1:
        filename = "td" + str(int(lambda_)) + ".pickle"
        with open(filename, "wb") as file:
            pickle.dump(Q, file)
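The script imports matplotlib but never actually plots the errors; a minimal sketch for plotting the squared error against λ (labels are my own choice) would be:

# Plot the squared error between each Sarsa(lambda) Q and the Monte-Carlo Q.
# Assumes the lists Lambda and Error from the loop above.
plt.plot(Lambda, Error, marker="o")
plt.xlabel("lambda")
plt.ylabel("squared error vs. Monte-Carlo Q")
plt.title("Sarsa(lambda) in Easy21")
plt.show()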
4 Linear Function Approximation in Easy21
Using the formulas from Lecture 6 (value function approximation), we only need to modify the code from the previous part:
- Backward view
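For reference, with the binary coarse-coded feature vector $x(s, a)$ returned by `get_feature` and $\hat{q}(s, a) = \theta^{\top} x(s, a)$, each transition in `linear_td` below performs

$$\delta = R + \gamma\,\theta^{\top} x(S', A') - \theta^{\top} x(S, A), \qquad E \leftarrow \gamma\lambda E + x(S, A), \qquad \theta \leftarrow \theta + \alpha\,\delta\,E,$$

now with a constant step size $\alpha = 0.01$ and constant exploration $\varepsilon = 0.05$, as the assignment suggests.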
import numpy as np
import pickle
import matplotlib.pyplot as plt
from env import *

def linear_td(theta, lambda_, epsilon=0.05, gamma=1, alpha=0.01):
    # One episode of backward-view Sarsa(lambda) with linear function approximation
    E = np.zeros_like(theta)
    flag = False
    s = init()
    key = tuple(s)
    a = greedy(theta, key, epsilon)
    env = Easy21()
    while not flag:
        key = tuple(s)
        # Take the action
        s, r, flag = env.step(s, a)
        # Feature vector of the current state-action pair
        f = get_feature(key, a)
        # Choose the next action from the next state
        key1 = tuple(s)
        a1 = greedy(theta, key1, epsilon)
        f1 = get_feature(key1, a1)
        # TD error; bootstrap only if the episode has not terminated
        delta = r - np.dot(theta, f)
        if not flag:
            delta += gamma * np.dot(theta, f1)
        # Accumulating trace and gradient step
        E = gamma * lambda_ * E + f
        theta += alpha * delta * E
        a = a1

def computeQ(theta):
    # Expand the linear value function into a tabular Q for comparison
    Q = dict()
    X = np.arange(1, 11)
    Y = np.arange(1, 22)
    A = [0, 1]
    for x in X:
        for y in Y:
            s = (x, y)
            Q[s] = np.array([0.0, 0.0])
            for a in A:
                f = get_feature(s, a)
                Q[s][a] = np.dot(theta, f)
    return Q
# Load the Monte-Carlo Q table as the ground truth
with open("mcQ.pickle", "rb") as file:
    Q1 = pickle.load(file)

Error = []
Lambda = np.linspace(0, 1, 11)
K = 10000
for lambda_ in Lambda:
    theta = np.zeros(36)
    for i in range(K):
        linear_td(theta, lambda_)
    Q = computeQ(theta)
    error = error_square(Q, Q1)
    Error.append(error)
    # Save the Q tables for lambda = 0 and lambda = 1
    if lambda_ == 0 or lambda_ == 1:
        filename = "linear_td" + str(int(lambda_)) + ".pickle"
        with open(filename, "wb") as file:
            pickle.dump(Q, file)
Appendix
Plotting function:
import pickle
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from mpl_toolkits.mplot3d import Axes3D

def plot(filename, title):
    # Load a saved Q table and plot the greedy value function as a surface
    with open(filename, "rb") as file:
        Q = pickle.load(file)
    X = np.arange(1, 11)
    Y = np.arange(1, 22)
    Z = np.zeros((21, 10))
    for key in Q:
        x = key[0]
        y = key[1]
        Z[y - 1][x - 1] = np.max(Q[key])
    X, Y = np.meshgrid(X, Y)
    fig = plt.figure()
    ax = fig.add_subplot(projection="3d")
    ax.plot_surface(X, Y, Z, cmap=cm.coolwarm)
    ax.set_xlabel("Dealer showing")
    ax.set_ylabel("Player sum")
    ax.set_zlabel("Value")
    plt.title(title)
    plt.show()

mc = "mcQ.pickle"
td0 = "td0.pickle"
td1 = "td1.pickle"
linear_td0 = "linear_td0.pickle"
linear_td1 = "linear_td1.pickle"

plot(mc, "mc")
plot(td0, "td0")
plot(td1, "td1")
plot(linear_td0, "linear_td0")
plot(linear_td1, "linear_td1")